Prevalence of Mental Health
Data Distribution
## data preprocessing
# gender info: bucket the free-text answers by their upper-cased first
# character -- F or W -> Female, M -> Male, anything else -> Others
gender_raw <- mental[, grep('gender', colnames(mental))]
gender_initial <- toupper(str_sub(gender_raw, 1, 1))
gender <- gender_raw
gender[!(gender_initial %in% c('F', 'M', 'W'))] <- 'Others'
gender[gender_initial %in% c('F', 'W')] <- 'Female'
gender[gender_initial %in% 'M'] <- 'Male'

# age: binned into left-closed decades [10,20), [20,30), ..., [60,70];
# values outside 10-70 fall in no bin and become NA
age <- mental[, grep('What.is.your.age.', colnames(mental))]
breaks <- seq(10, 70, by = 10)
age_category <- cut(
  as.numeric(age),
  breaks,
  include.lowest = TRUE,
  right = FALSE,
  dig.lab = 10
)

# condition: every column whose name contains "condition"
condition <- mental[grep('condition', colnames(mental))]
condition.diagnosed <- condition[["Have.you.been.diagnosed.with.a.mental.health.condition.by.a.medical.professional."]]
condition.type <- condition[["If.so..what.condition.s..were.you.diagnosed.with."]]
# one matrix column per "|"-separated diagnosis
temp <- str_split(condition.type, '[|]', simplify = TRUE)

# data for plot
# cbind() builds a single matrix first, so every column of plotdata
# (including age) is stored as text
plot_matrix <- cbind(
  gender = gender,
  age = age,
  age_category = as.character(age_category),
  condition.diagnosed = condition.diagnosed,
  temp
)
plotdata <- data.frame(plot_matrix)
# long format: one row per respondent x diagnosis column
plotdata_long <- melt(
  plotdata,
  id.vars = c('gender', 'age', 'age_category', 'condition.diagnosed')
)
## age distribution
# Drop the implausible age "323". A logical mask is used instead of
# `-which(...)`, which would remove every row when nothing matches.
plotdata <- plotdata[!(plotdata$age %in% "323"), ]
# `age` is stored as text in plotdata, so it is converted to numbers here;
# otherwise the boxes are computed on the rank of each distinct age label
# rather than on the ages themselves.
box_gender <- ggplot(plotdata) +
  geom_boxplot(aes(
    x = gender,
    y = as.numeric(as.character(age)),
    fill = gender,
    group = gender
  )) +
  scale_y_continuous(breaks = seq(19, 99, 10)) +
  scale_fill_manual(values = c("#E41A1C", "#449B75", "#AC5782")) +
  theme_bw() +
  ggtitle("Age Distribution by Sex") +
  xlab("Gender") +
  ylab("Age") +
  theme(plot.title = element_text(hjust = 0.5))
box_gender

Women's ages mainly range from 28 to 40 and, similarly, men's ages mainly range from 29 to 39; both groups have a median age of 33.
## condition.diagnosed
library(ggpubr)
# Fill colours for condition.diagnosed, used by both panels
diagnosed_colours <- c("#449B75", "#E41A1C")

# Left panel: respondents per gender, dodged by diagnosis answer
diagnosed_by_gender <- ggplot(plotdata) +
  geom_bar(aes(x = gender, fill = condition.diagnosed), position = 'dodge') +
  scale_fill_manual(values = diagnosed_colours) +
  theme_bw() +
  ylab("Number of People") +
  xlab("Gender") +
  labs(fill = "Have you been diagnosed with a mental health \n condition by a medical professional?")

# Right panel: respondents per age bin, dodged by diagnosis answer
diagnosed_by_age <- ggplot(plotdata) +
  geom_bar(aes(x = age_category, fill = condition.diagnosed), position = 'dodge') +
  scale_fill_manual(values = diagnosed_colours) +
  theme_bw() +
  ylab("Number of People") +
  xlab("Age") +
  theme(axis.text.x = element_text(angle = 0))

# Both panels side by side with one shared legend underneath
figure <- ggpubr::ggarrange(
  diagnosed_by_gender,
  diagnosed_by_age,
  common.legend = TRUE,
  legend = "bottom"
)
annotate_figure(
  figure,
  top = text_grob("Condition Diagnosed by Sex and Age", face = "bold")
)

For women, the number of people who have been diagnosed with some type of mental health disorder is about twice the number who have not been diagnosed, whereas for men the two numbers are roughly equal. Across all age categories, the number of people who have been diagnosed with a condition is somewhat higher than the number who have not. In general, mental health problems are prevalent and ubiquitous within the tech industry.
Mental Disorder Type
## condition plot
# Six colours: five named diagnoses plus the pooled "others" category
col <- colorRampPalette(brewer.pal(9, 'Set1'))(6)
# Drop the empty cells left by splitting the "|"-separated diagnosis lists
plotdata_long <- plotdata_long %>%
  filter(! value == '')
# Print every distinct diagnosis label found in the data
a <- unique(plotdata_long$value)
a
# Diagnoses that keep their own category. They are listed by name rather
# than by position in `a` (previously elements 1, 2, 4, 8 and 9), because
# the order returned by unique() depends on the row order of the data.
main_conditions <- c(
  "Anxiety Disorder (Generalized, Social, Phobia, etc)",
  "Mood Disorder (Depression, Bipolar Disorder, etc)",
  "Attention Deficit Hyperactivity Disorder",
  "Obsessive-Compulsive Disorder",
  "Post-traumatic Stress Disorder"
)
# Every other label is pooled into "others"
plotdata_long$value[!(plotdata_long$value %in% main_conditions)] <- "others"

# Stacked counts per age bin, one facet per gender
ggplot(plotdata_long) +
  geom_bar(aes(x = age_category, fill = value), position = 'stack') +
  facet_wrap(~gender) +
  scale_fill_manual(values = col) +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    legend.position = 'bottom',
    legend.text = element_text(size = 9),
    axis.text.x = element_text(angle = 90)
  ) +
  guides(fill = guide_legend(
    ncol = 2,
    keywidth = unit(4, 'mm'),
    keyheight = unit(4, 'mm')
  )) +
  ylab("Number of People") +
  ggtitle("Specific Mental Health Disorders by Sex and Age") +
  theme(plot.title = element_text(hjust = 0.5))

Anxiety Disorder and Mood Disorder are the two most prevalent mental health disorders across all ages for both male and female within the tech industry.
Maps
Overview
In this part, we show statistics across the different states in the US. To ensure that the analysis is accurate, we exclude states whose sample size is less than 5. These states are shown as NA on the map.
# Keep the survey columns needed for the state maps and give them short names
df <- mental %>%
  dplyr::select(
    'Do.you.currently.have.a.mental.health.disorder.',
    'Have.you.ever.sought.treatment.for.a.mental.health.issue.from.a.mental.health.professional.',
    'What.US.state.or.territory.do.you.live.in.',
    'Do.you.have.medical.coverage..private.insurance.or.state.provided..which.includes.treatment.of..mental.health.issues.',
    'Does.your.employer.provide.mental.health.benefits.as.part.of.healthcare.coverage.'
  )
names(df) <- c('disorder1', 'treatment', 'state', 'mental_insurance', 'employer_benefit1')

# Numeric coding: disorder Yes/Maybe/No -> 1/0.5/0 (anything else NA);
# employer benefit 'Yes' -> 1, every other answer (including NA) -> 0
df <- df %>%
  mutate(disorder = case_when(
    disorder1 == 'Yes' ~ 1,
    disorder1 == 'Maybe' ~ 0.5,
    disorder1 == 'No' ~ 0))
df <- df %>%
  mutate(employer_benefit = case_when(
    employer_benefit1 == 'Yes' ~ 1,
    TRUE ~ 0))
df <- df %>%
  dplyr::select('state', 'treatment', 'employer_benefit', 'disorder')

# Share of respondents per state who sought treatment (treatment == 1)
df1 <- df %>%
  dplyr::select('state', 'treatment')
df_treatment1 <- df1 %>%
  dplyr::group_by(state) %>%
  dplyr::summarise(count = n())
df_treatment2 <- df1 %>%
  filter(treatment == 1) %>%
  dplyr::group_by(state) %>%
  dplyr::summarise(count1 = n())
df_treatment <- df_treatment1 %>%
  dplyr::left_join(df_treatment2, by = "state") %>%
  dplyr::mutate(
    # a state absent from df_treatment2 has zero positive answers, not an
    # unknown number, so its rate must be 0 rather than NA
    count1 = dplyr::coalesce(count1, 0L),
    treatment_prec = count1 / count
  ) %>%
  # states with fewer than 5 respondents are excluded
  dplyr::filter(count > 4)

# Share of respondents per state whose employer provides mental health benefits
df2 <- df %>%
  dplyr::select('state', 'employer_benefit')
df_employer_benefit1 <- df2 %>%
  dplyr::group_by(state) %>%
  dplyr::summarise(count = n())
df_employer_benefit2 <- df2 %>%
  dplyr::filter(employer_benefit == 1) %>%
  dplyr::group_by(state) %>%
  dplyr::summarise(count1 = n())
df_employer_benefit <- df_employer_benefit1 %>%
  dplyr::left_join(df_employer_benefit2, by = "state") %>%
  dplyr::mutate(
    # same zero-vs-NA handling as for the treatment rate
    count1 = dplyr::coalesce(count1, 0L),
    employer_benefit_prec = count1 / count
  ) %>%
  dplyr::filter(count > 4)

# One row per state carrying both rates
df_final <- merge(x = df_employer_benefit, y = df_treatment, by = "state", all = TRUE) %>%
  dplyr::select('state', 'treatment_prec', 'employer_benefit_prec')
# Round every numeric column of a data frame, leaving other columns untouched.
#
# x:      a data frame
# digits: number of decimal places, passed to round()
# Returns `x` with its numeric (integer or double) columns rounded.
round_df <- function(x, digits) {
  # is.numeric() rather than mode() == 'numeric': mode() also reports
  # "numeric" for factors, which round() cannot handle
  numeric_columns <- vapply(x, is.numeric, logical(1))
  x[numeric_columns] <- lapply(x[numeric_columns], round, digits = digits)
  x
}
# Round both rates to three decimals for display
df_final <- df_final %>% round_df(3)
# From https://www.census.gov/geo/maps-data/data/cbf/cbf_state.html
states <- shapefile("cb_2018_us_state_20m.shp")
# Attach the per-state rates to the polygon attribute table, matching the
# shapefile's NAME column against our `state` column. States without a match
# get NA rates.
combined <- left_join(states@data, df_final, by = c(NAME = "state"))
states@data <- combined
Frequency of Employee Who Sought Mental Health Treatment
# Choropleth of the per-state share of respondents who sought treatment
bins <- c(0, 0.2, 0.4, 0.6, 0.8, 1)
pal <- colorBin("YlOrRd", domain = states$treatment_prec, bins = bins, right = TRUE)
# Hover text: state name followed by its rate
labels_states <- paste(
  states$NAME,
  "Employee Sought Treatment Frequency:", states$treatment_prec
)
m <- leaflet() %>%
  setView(-96, 37.8, 4) %>%
  # the MapBox token is read from the environment, not stored in the script
  addProviderTiles("MapBox", options = providerTileOptions(
    id = "mapbox.light",
    accessToken = Sys.getenv('MAPBOX_ACCESS_TOKEN'))) %>%
  addPolygons(
    data = states,
    fillColor = ~pal(treatment_prec),
    weight = 2,
    opacity = 1,
    color = "white",
    dashArray = "3",
    fillOpacity = 0.7,
    # argument spelled out in full instead of relying on partial matching
    highlightOptions = highlightOptions(
      weight = 5,
      color = "#666",
      dashArray = "",
      fillOpacity = 0.7,
      bringToFront = TRUE),
    label = labels_states,
    labelOptions = labelOptions(
      style = list("font-weight" = "normal", padding = "3px 8px"),
      textsize = "15px",
      direction = "auto")) %>%
  addLegend(
    data = states,
    pal = pal,
    title = "Frequency of Employee Who Sought Treatment",
    values = ~treatment_prec,
    opacity = 0.7,
    position = "bottomright")
m
Among all the states, Iowa, North Carolina, New Jersey and Connecticut have the highest scores (>0.8), while Kansas, Missouri and Arizona have the lowest scores (<0.2). In states with high scores, a higher percentage of survey participants have sought mental health treatment, suggesting that the overall mental health situation there is worse.
Frequency of Employer with Mental Health Benefit
# Choropleth of the per-state share of respondents whose employer provides
# mental health benefits
bins <- c(0, 0.2, 0.4, 0.6, 0.8, 1)
pal <- colorBin("YlOrRd", domain = states$employer_benefit_prec, bins = bins)
# Hover text: state name followed by its rate
labels_states <- paste(
  states$NAME,
  "Employer with Mental Healthcare Benefit Frequency:", states$employer_benefit_prec
)
n <- leaflet() %>%
  setView(-96, 37.8, 4) %>%
  # the MapBox token is read from the environment, not stored in the script
  addProviderTiles("MapBox", options = providerTileOptions(
    id = "mapbox.light",
    accessToken = Sys.getenv('MAPBOX_ACCESS_TOKEN'))) %>%
  addPolygons(
    data = states,
    fillColor = ~pal(employer_benefit_prec),
    weight = 2,
    opacity = 1,
    color = "white",
    dashArray = "3",
    fillOpacity = 0.7,
    # argument spelled out in full instead of relying on partial matching
    highlightOptions = highlightOptions(
      weight = 5,
      color = "#666",
      dashArray = "",
      fillOpacity = 0.7,
      bringToFront = TRUE),
    label = labels_states,
    labelOptions = labelOptions(
      style = list("font-weight" = "normal", padding = "3px 8px"),
      textsize = "15px",
      direction = "auto")) %>%
  addLegend(
    data = states,
    pal = pal,
    title = "Frequency of Employer with Mental Healthcare Benefit",
    values = ~employer_benefit_prec,
    opacity = 0.7,
    position = "bottomright")
n
In this plot, we show the percentage of employers who provide mental healthcare benefits. Compared to the previous plot, none of the states scores higher than 0.8 in this analysis, meaning that the overall level of mental healthcare benefits is not good enough. Among all the states, Iowa is still in the highest range, suggesting that although its mental health benefits are good, its mental health situation is still not optimal. Besides, Michigan, Pennsylvania, Tennessee and Missouri also do well in mental healthcare. New Jersey scores the lowest in the US in mental healthcare, while its employee mental health situation is also among the worst.
Text Analysis
Overview
In this section, we further explore people’s attitudes towards mental health in the workplace. We ran a text analysis on participants’ answers to the question “Would you bring up a mental health issue with a potential employer in an interview? Why or why not?”
# import data
survey_path <- "mental-heath-in-tech-2016_20161114.csv"
mental_health <- read_csv(survey_path)
# preview column 38 before it is renamed below
head(mental_health[38])
# columns 37 and 38 become the interview answer and its free-text explanation
names(mental_health)[c(37, 38)] <- c("menissue_interview", "text")
# keep only respondents who wrote an explanation
mental_health <- mental_health %>% filter(!is.na(text))
# frequency table of the interview answers, with readable column headers
answer_counts <- table(mental_health$menissue_interview)
menissue_interview <- setNames(
  as.data.frame(answer_counts),
  c("Would you bring up a mental health issue with a potential employer in an interview?", "Frequency")
)
menissue_interview
Wordcloud for Each Answer
#answer = maybe
#create corpus
mental_health_maybe <- filter(mental_health, menissue_interview == "Maybe")
# seq_len() stays valid for zero rows, where 1:nrow() would yield c(1, 0)
mental_health_maybe$doc_id <- as.character(seq_len(nrow(mental_health_maybe)))
# DataframeSource() needs `doc_id` first and `text` second; selecting by name
# keeps working if the number of columns in the CSV changes
mental_health_maybe <- mental_health_maybe[, c("doc_id", "text")]
maybe_corpus <- DataframeSource(mental_health_maybe) %>% VCorpus(.)
#clean corpus
# Normalise a tm corpus for the word clouds and return the cleaned corpus.
# Steps, in order: strip punctuation, lower-case, drop the custom word list
# plus the English stop words, drop numbers, collapse whitespace.
# NOTE(review): punctuation is removed before the stop words, so stop words
# containing an apostrophe (e.g. "don't") presumably no longer match and
# survive as "dont" -- confirm whether that is intended.
clean_corpus <- function(corpus){
  dropped_words <- c("mental", "health", "interview", "feel", "bring", "want", "made", "get", "employer", "hire", "need", "know","sure", "may", "affect", "job", stopwords("en"))
  corpus %>%
    tm_map(removePunctuation) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, dropped_words) %>%
    tm_map(removeNumbers) %>%
    tm_map(stripWhitespace)
}
maybe_clean <- maybe_corpus %>% clean_corpus()
#stem and stem completion
# Complete the stemmed words of one document.
#
# x:          a document (or string) holding space-separated stems
# dictionary: passed on to stemCompletion() as the completion dictionary
# Returns a single string of the completed words separated by spaces.
stemCompletion2 <- function(x, dictionary) {
  stems <- unlist(strsplit(as.character(x), " "))
  # discard the empty tokens produced by repeated spaces
  stems <- stems[stems != ""]
  completed <- stemCompletion(stems, dictionary = dictionary)
  stripWhitespace(paste(completed, collapse = " "))
}
# stem each cleaned answer, then complete the stems against the cleaned corpus
maybe_stemmed <- maybe_clean %>% tm_map(stemDocument)
maybe_completed_docs <- lapply(maybe_stemmed, stemCompletion2, dictionary = maybe_clean)
maybe_compl <- Corpus(VectorSource(maybe_completed_docs))
#word cloud
maybe_tdm <- TermDocumentMatrix(maybe_compl)
# one row per term/document pair, highest tf-idf first
maybe_tf_idf <- bind_tf_idf(tidy(maybe_tdm), term, document, count)
maybe_tf_idf <- arrange(maybe_tf_idf, desc(tf_idf))
set.seed(1000)
# columns 1 and 4 of the tf-idf table are `term` and `tf`
wordcloud2(maybe_tf_idf[, c("term", "tf")], color = "random-dark", shape = "diamond")
The word cloud above shows the most common words used by participants answering maybe. They use uncertain words like I don’t know and unsure a lot, which is consistent with their answer.
#answer = yes
#create corpus and clean
mental_health_yes <- filter(mental_health, menissue_interview == "Yes")
# seq_len() stays valid for zero rows, where 1:nrow() would yield c(1, 0)
mental_health_yes$doc_id <- as.character(seq_len(nrow(mental_health_yes)))
# DataframeSource() needs `doc_id` first and `text` second; selecting by name
# keeps working if the number of columns in the CSV changes
mental_health_yes <- mental_health_yes[, c("doc_id", "text")]
yes_corpus <- DataframeSource(mental_health_yes) %>% VCorpus(.)
yes_clean <- clean_corpus(yes_corpus)
#stem and stem completion
yes_stemmed <- tm_map(yes_clean, stemDocument)
yes_compl <- lapply(yes_stemmed, stemCompletion2, dictionary = yes_clean) %>%
  VectorSource() %>%
  Corpus()
#word cloud
yes_tdm <- TermDocumentMatrix(yes_compl)
# one row per term/document pair, highest tf-idf first
yes_tf_idf <- tidy(yes_tdm) %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf_idf))
set.seed(1000)
# the cloud is sized by `tf` (formerly picked as columns 1 and 4)
wordcloud2(yes_tf_idf[, c("term", "tf")], color = "random-dark", shape = "diamond")
The word cloud above shows the most common words used by participants answering yes. Words like affects, effect and relevant show that they think the issue is important. Although positive words like advocate are used, they also use words like unfortunate to show their worries.
#answer = no
#create corpus and clean
mental_health_no <- filter(mental_health, menissue_interview == "No")
# seq_len() stays valid for zero rows, where 1:nrow() would yield c(1, 0)
mental_health_no$doc_id <- as.character(seq_len(nrow(mental_health_no)))
# DataframeSource() needs `doc_id` first and `text` second; selecting by name
# keeps working if the number of columns in the CSV changes
mental_health_no <- mental_health_no[, c("doc_id", "text")]
no_corpus <- DataframeSource(mental_health_no) %>% VCorpus(.)
no_clean <- clean_corpus(no_corpus)
#stem and stem completion
no_stemmed <- tm_map(no_clean, stemDocument)
no_compl <- lapply(no_stemmed, stemCompletion2, dictionary = no_clean) %>%
  VectorSource() %>%
  Corpus()
#word cloud
no_tdm <- TermDocumentMatrix(no_compl)
# one row per term/document pair, highest tf-idf first
no_tf_idf <- tidy(no_tdm) %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf_idf))
set.seed(1000)
# the cloud is sized by `tf` (formerly picked as columns 1 and 4)
wordcloud2(no_tf_idf[, c("term", "tf")], color = "random-dark", shape = "diamond")
The word cloud above shows the most common words used by participants answering no. They tend to use negative words in their answers, such as lose, taboo, and cost.
Word Frequency for Each Answer
# Horizontal bar chart of the top-10 terms (by summed count; ties are kept)
# of one answer group.
#   term_table:   tidy term/document table with `term` and `count` columns
#   answer_title: plot title
#   bar_fill:     bar colour
plot_top_terms <- function(term_table, answer_title, bar_fill) {
  term_table %>%
    dplyr::group_by(term) %>%
    dplyr::summarize("frequency" = sum(count)) %>%
    slice_max(frequency, n = 10) %>%
    ggplot(aes(reorder(term, frequency), frequency)) +
    geom_bar(stat = "identity", fill = bar_fill) +
    coord_flip() +
    ggtitle(answer_title) +
    theme(
      axis.title.y = element_blank(),
      axis.title.x = element_blank(),
      panel.background = element_blank(),
      plot.title = element_text(face = "bold", color = "black", size = 10)
    )
}
#top 10 words in maybe
b1 <- plot_top_terms(maybe_tf_idf, "Answer = Maybe", "#AC5782")
#top 10 words in yes
b2 <- plot_top_terms(yes_tf_idf, "Answer = Yes", "#E41A1C")
#top 10 words in no
b3 <- plot_top_terms(no_tf_idf, "Answer = No", "#449B75")
# panels are laid out in the order maybe, no, yes
b1 + b3 + b2 + plot_annotation('Top 10 Most Frequent Words for Each Answer')

Participants who answer maybe tend to use uncertain words like depends and discuss, but they also use some negative words such as stigma and wouldn’t. Those who answer yes tend to use positive words like important and good while those answering no use negative words such as stigma, don’t, wouldn’t and negatively.
Sentiment Analysis for Each Answer
#import Hu & Liu Dictionary
pos <- read.table("positive-words.txt", as.is = TRUE)
neg <- read.table("negative-words.txt", as.is = TRUE)
#define sentiment function
# Polarity score of one text: (positive - negative) / (positive + negative),
# counting the tokens found in the first column of `pos` and `neg`.
#
# words: a single character string
# Returns a number in [-1, 1], or NA when the text contains no word from
# either list (the ratio would otherwise be 0/0).
sentiment <- function(words){
  # lower-case the tokens so that capitalised words (e.g. at the start of a
  # sentence) still match; assumes the lexicon entries are lower-case --
  # TODO confirm against the word lists
  tok <- tolower(quanteda::tokens(words)[[1]])
  pos.count <- sum(tok %in% pos[,1])
  neg.count <- sum(tok %in% neg[,1])
  total <- pos.count + neg.count
  if (total == 0) {
    return(NA_real_)
  }
  (pos.count - neg.count) / total
}
#calculate the sentiment
# vapply() guarantees one numeric score per answer, whereas sapply() changes
# its return type when the input is empty
mental_health$sentiment <- vapply(mental_health$text, sentiment, numeric(1))
#plot the relationship between sentiment and answer
# Box plot per answer; the white diamond marks the group mean
p1 <- ggplot(mental_health, aes(x = menissue_interview, y = sentiment)) +
  geom_boxplot(aes(fill = menissue_interview)) +
  stat_summary(
    mapping = aes(group = menissue_interview),
    fun = "mean",
    geom = "point",
    shape = 23,
    size = 3,
    fill = "white"
  ) +
  scale_fill_manual(values = c("#AC5782", "#449B75", "#E41A1C")) +
  labs(title = "Distribution of Sentiment Score by Answer", y = "Sentiment Score") +
  theme(
    legend.position = 'none',
    axis.title.x = element_blank(),
    axis.title.y = element_text(vjust = 2),
    panel.background = element_blank(),
    panel.grid.major = element_line(color = "gray50", size = 0.5),
    panel.grid.major.x = element_blank(),
    plot.title = element_text(face = "bold", color = "black", size = 12)
  )
# interactive version of the plot
pp1 <- ggplotly(p1, tooltip = "sentiment")
pp1